import ast
from collections import Counter

import numpy as np
import pandas as pd

_DEFAULT_CLICK_UID_CSV = (
    "/home/datanfs/macong_data/tencent_data/train_preliminary/click_uid.csv"
)


def get_click_uid(train_path=_DEFAULT_CLICK_UID_CSV):
    """Print the longest creative-id sequence length and the vocabulary size.

    Reads the CSV at ``train_path`` and parses every cell of its
    ``creative_id`` column as a Python literal sequence (e.g. ``"[1, 2, 3]"``).
    It tracks the longest sequence seen and counts how often each id occurs,
    then prints the maximum length and the number of distinct ids.

    Args:
        train_path: Path of the CSV file to read. Defaults to the
            preliminary-round training click file used by the original
            script.

    Returns:
        None. Results are written to stdout. If any parsed id is a ``str``,
        an error line is printed and the function returns early without
        printing the statistics.

    Raises:
        ValueError, SyntaxError: If a ``creative_id`` cell is not a valid
            Python literal.
    """
    # Columns listed by the original author's note (not checked here):
    # ['user_id', 'age', 'gender', 'creative_id', 'time', 'click_times']
    train_data = pd.read_csv(train_path, encoding="utf-8")

    vocab_count = Counter()
    max_length = 0

    # Count word (creative id) frequencies.
    for sen in train_data["creative_id"]:
        # literal_eval accepts literals only, so a crafted cell cannot execute
        # code the way a bare eval() of file contents could.
        words = ast.literal_eval(sen)
        max_length = max(max_length, len(words))
        # Ids must not be strings; bail out on the first offending row.
        if any(isinstance(word, str) for word in words):
            print("cid type is str error")
            return
        vocab_count.update(words)

    print("max length:", max_length)
    print("词典个数:", len(vocab_count))


def unit_test():
    """Smoke-run: print a banner, then call get_click_uid() with no arguments."""
    banner = "### look click udi csv ###"
    print(banner)
    get_click_uid()


# Script entry point: run the smoke check only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    unit_test()
